// Copyright (C) 2003
// Gerhard Neumann (gneumann@gmx.net)
// Stephan Neumann (sneumann@gmx.net) 
//                
// This file is part of RL Toolbox.
// http://www.igi.tugraz.at/ril_toolbox
//
// All rights reserved.
// 
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
// 
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// USE OF THE EXAMPLE
// gridworldshortestpath [-g gridworldfile] [-d debugfile]
//
// This example shows how to learn the shortest-path problem in a given gridworld
// with Q-Function (SARSA) learning.

#include <stdio.h>
#include <string.h>
#include <time.h>

#include "ril_debug.h"
#include "ctdlearner.h"
#include "cpolicies.h"
#include "cagent.h"
#include "cagentlogger.h"
#include "crewardmodel.h"
#include "canalyzer.h"
#include "cgridworldmodel.h"
#include "cvfunctionlearner.h"

 
// This is the entry point for this application


int main(int argc, char **argv)
{
	// Initialize the random generator 
	srand((unsigned int) time(NULL));

	int arg = 1;
	char *gridworldFileName = "Gridworld_10x10.txt";
	
	// Console Input Processing
	while (arg < argc - 1)
	{
		if (strcmp(argv[arg], "-d") == 0)
		{
			// The "-d" option enables debugging
			arg ++;
			char *debugFile = argv[arg];
			DebugInit(debugFile, "+", false);
		}
		if (strcmp(argv[arg], "-g") == 0)
		{
			// The "-g" option sets the gridworld file
			arg ++;
			gridworldFileName = argv[arg];
		}
		arg++;
	}

	printf ("-=<   Reinforcement Learning Example - Learning the Shortest Path Problem in a Gridworld   >=-\n\n");
	
	// Create the Gridworld, set max_bounces to 50. The model will reset the episode if "max_bounces" 
	// bounces (walk into the wall) are reached.
	CGridWorldModel *gridworldModel = new CGridWorldModel(gridworldFileName, 50);
	
	// Set the reward values 
	// -0.2 for each move (in order to learn the shortest path)
	// -5.0 for each bounce
	// 100 for reaching the target state(s)

	gridworldModel->setRewardStandard(-0.2);
	gridworldModel->setRewardBounce(-5.0);
	gridworldModel->setRewardSuccess(100.0);

	// Create the environment for the agent, the environment saves the current state of the agent.
	CEnvironmentModel *environmentModel = new CTransitionFunctionEnvironment(gridworldModel);
	
	// the gridworld model implements the reward function too, so we can use this
	CRewardFunction *rewardFunction = gridworldModel;
	
	// Create the agent in our environmentModel.
	CAgent *agent = new CAgent(environmentModel);

	// Add all possible Actions to the agent
	// left
	agent->addAction(new CGridWorldAction(-1,0));
	// right
	agent->addAction(new CGridWorldAction(1,0));
	// up
	agent->addAction(new CGridWorldAction(0,-1));
	// down
	agent->addAction(new CGridWorldAction(0,1));

	// For the shortest path problem, we need a global state, i.e. each possible position in the grid is an own state
	CAbstractStateDiscretizer *globalGridworldstate = new CGlobalGridWorldDiscreteState(gridworldModel->getSizeX(), gridworldModel->getSizeY());
	
	// In order to use the discretizer we have to add it to the agent's state modifier list. 
	// Always add your state modifiers to that list !!
	agent->addStateModifier(globalGridworldstate);
	
	// Create an Agent Logger for logging the episodes
	// Our agent logger logs the gridworld model state and the actions of the agent. This logger holds all episodes in memory.
	CAgentLogger *logger = new CAgentLogger(gridworldModel->getStateProperties(), agent->getActions());
	// add the logger to the agent's listener list
	agent->addSemiMDPListener(logger);

	// Q-Learning starts here

	// Create our Q-Function, we will use a Feature Q-Function, which is table-like representation of the Q-Function.
	// The Q-Function needs to know which actions and which state it has to use
	CFeatureQFunction *qFunction = new CFeatureQFunction(agent->getActions(), globalGridworldstate);

	// Create the Q-Function learner, we will use a SarsaLearner
	// The Sarsa Learner needs the reward function, the Q-Function and the agent.
	// The agent is used to get the estimation policy, because Sarsa Learning is On-Policy learning.
	CSarsaLearner *qFunctionLearner = new CSarsaLearner(rewardFunction, qFunction, agent);

	// Create the Controller for the agent from the QFunction. We will use a EpsilonGreedy-Policy for exploration.
	CAgentController *qLearnerPolicy = new CQStochasticPolicy(agent->getActions(), new CEpsilonGreedyDistribution(0.1), qFunction);
		
	// Set some options of the Etraces which are not default
	qFunctionLearner->setParameter("ReplacingETraces", 1.0);
	qFunctionLearner->setParameter("Lambda", 0.95);
	
	// Add the learner to the agent listener list, so he can learn from the agent's steps.
	agent->addSemiMDPListener(qFunctionLearner);
	
	// Set the controller of the agent
	agent->setController(qLearnerPolicy);
	
	// Disable logging of the current Episode
	agent->setLogEpisode(false);

	int steps = 0; 
    int ges_failed = 0, ges_succeeded = 0, last_succeeded = 0;
    
    int totalSteps = 0;

	// Start Learning, Learn 50 Episodes
	for (int i = 0; i < 200; i++)
	{
		// Start a new Episode, the agent gets reseted in one of the start states
		agent->startNewEpisode();
		// Learn 1 Episode with maximal 1000 steps 
		steps = agent->doControllerEpisode(1, 1000);

		totalSteps += steps;

		// Check if the Episode failed
		// The episode has failed if max_bounces has been reached (indicated through environmentModel->isFailed()), 
		// or max_steps has been reached
		if (environmentModel->isFailed() || steps >= 1000)
		{
			ges_failed++;
			last_succeeded = 0;
			printf("Episode %d failed with %d steps\n", i, steps);
		}
		else
		{
			ges_succeeded++;
			last_succeeded++;

			printf("Episode %d succeded with %d steps, %d Episodes succeded in the row\n", i, steps, last_succeeded);
		}
		
	}

	// Save the QFunction
	FILE *qFuncFile = fopen("QFunctionShortestPath.table","w");
	qFunction->saveData(qFuncFile);
	fclose(qFuncFile);

	// Save the QFunction
	FILE *loggerFile = fopen("gridworldShortestPath.episodes","w");
	logger->saveData(loggerFile);
	fclose(loggerFile);

	printf("\n\n<< Press Enter >>\n");
	getchar();

	// Cleaning Up
	
	delete qFunction;
	delete qLearnerPolicy;
	delete qFunctionLearner;
	delete logger;
	delete agent;
	delete environmentModel;
	delete gridworldModel;
}

